This file loads produces Figures and runs analysis using the real data.
library(tidyverse)
library(VectraPolarisData)
library(ggridges)
library(MASS)
library(pscl)
library(patchwork)
library(viridis)
library(data.table)
library(Rtsne)
library(Rphenograph)
knitr::opts_chunk$set(echo = TRUE,
warning = FALSE,
message = FALSE,
fig.width = 9,
fig.height = 4,
fig.path = './figs/'
)
theme_set(theme_bw() + theme(legend.position = "bottom"))
The data is publicly available on Bioconductor as the package
VectraPolarisData. You can install it here:
if (!requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
BiocManager::install("VectraPolarisData")
Converting the ovarian cancer dataset from a
SpatialExperiment object to a dataframe.
library(VectraPolarisData)
spe_ovarian <- HumanOvarianCancerVP()
## Assays slots
assays_slot <- assays(spe_ovarian)
intensities_df <- assays_slot$intensities
nucleus_intensities_df<- assays_slot$nucleus_intensities
rownames(nucleus_intensities_df) <- paste0("nucleus_", rownames(nucleus_intensities_df))
membrane_intensities_df<- assays_slot$membrane_intensities
rownames(membrane_intensities_df) <- paste0("membrane_", rownames(membrane_intensities_df))
# colData and spatialData
colData_df <- colData(spe_ovarian)
spatialCoords_df <- spatialCoords(spe_ovarian)
# clinical data
patient_level_ovarian <- metadata(spe_ovarian)$clinical_data %>%
# create binary stage variable
mutate(stage_bin = ifelse(stage %in% c("1", "2"), 0, 1))
cell_level_ovarian <- as.data.frame(cbind(colData_df,
spatialCoords_df,
t(intensities_df),
t(nucleus_intensities_df),
t(membrane_intensities_df))
) %>%
dplyr::rename(cd19 = cd19_opal_480,
cd68 = cd68_opal_520,
cd3 = cd3_opal_540,
cd8 = cd8_opal_650,
ier3 = ier3_opal_620,
pstat3 = p_stat3_opal_570,
ck = ck_opal_780,
ki67 = ki67_opal_690)
# data frame with clinical characteristics where each row is a different cell
#ovarian_df <- full_join(patient_level_ovarian, cell_level_ovarian, by = "sample_id")
rm(spe_ovarian, assays_slot, intensities_df, nucleus_intensities_df, membrane_intensities_df, colData_df, spatialCoords_df)
spe_lung <- HumanLungCancerV3()
## Assays slots
assays_slot <- assays(spe_lung)
intensities_df <- assays_slot$intensities
nucleus_intensities_df<- assays_slot$nucleus_intensities
rownames(nucleus_intensities_df) <- paste0("nucleus_", rownames(nucleus_intensities_df))
membrane_intensities_df<- assays_slot$membrane_intensities
rownames(membrane_intensities_df) <- paste0("membrane_", rownames(membrane_intensities_df))
# colData and spatialData
colData_df <- colData(spe_lung)
spatialCoords_df <- spatialCoords(spe_lung)
# clinical data
patient_level_lung <- metadata(spe_lung)$clinical_data
cell_level_lung <- as_tibble(cbind(colData_df,
spatialCoords_df,
t(intensities_df),
t(nucleus_intensities_df),
t(membrane_intensities_df))
) %>%
dplyr::rename(cd19 = cd19_opal_650,
cd3 = cd3_opal_520,
cd14 = cd14_opal_540,
cd8 = cd8_opal_620,
hladr = hladr_opal_690,
ck = ck_opal_570)
# data frame with clinical characteristics where each row is a different cell
#lung_df <- full_join(patient_level_lung, cell_level_lung, by = "slide_id")
rm(spe_lung, assays_slot, intensities_df, nucleus_intensities_df, membrane_intensities_df, colData_df, spatialCoords_df)
As an illustrative example will use the mxnorm package
to normalize the lung data.
mx_dataset objectUse slide_id as slide identifier - Use
sample_id as image identifier - Use the following marker
columns: - cd19 - cd14 - cd3 -
cd8 - hladr - ck -
dapi - Use tissue_category as a metadata
column
library(mxnorm)
mx_data = mx_dataset(data = cell_level_lung,
slide_id = "slide_id",
image_id = "sample_id",
marker_cols = c("cd19",
"cd3",
"cd14",
"cd8",
"hladr",
"ck",
"dapi"
),
metadata_cols = c("tissue_category", "phenotype_ck", "phenotype_cd8", "phenotype_cd14",
"phenotype_other", "phenotype_cd19"))
summary(mx_data)
## Call:
## `mx_dataset` object with 153 slide(s), 7 marker column(s), and 6 metadata column(s)
And let’s look at the data object we’re going to use going forward:
knitr::kable(head(mx_data$data)) %>%
kableExtra::kable_styling() %>% kableExtra::scroll_box(width = "100%")
| slide_id | sample_id | cd19 | cd3 | cd14 | cd8 | hladr | ck | dapi | tissue_category | phenotype_ck | phenotype_cd8 | phenotype_cd14 | phenotype_other | phenotype_cd19 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| #01 0-889-121 | #01 0-889-121 P44_[40864,18015].im3 | 0.201 | 0.025 | 0.155 | 0.132 | 0.115 | 1.024 | 5.166 | Tumor | CK- | CD8- | CD14- | Other- | CD19- |
| #01 0-889-121 | #01 0-889-121 P44_[40864,18015].im3 | 0.546 | 0.172 | 1.799 | 0.821 | 0.239 | 0.721 | 7.477 | Stroma | CK- | CD8- | CD14- | Other- | CD19- |
| #01 0-889-121 | #01 0-889-121 P44_[40864,18015].im3 | 1.388 | 0.060 | 0.173 | 0.989 | 0.268 | 0.391 | 8.237 | Stroma | CK- | CD8- | CD14- | Other- | CD19- |
| #01 0-889-121 | #01 0-889-121 P44_[40864,18015].im3 | 0.683 | 0.004 | 1.136 | 0.174 | 0.245 | 0.468 | 3.786 | Stroma | CK- | CD8- | CD14- | Other- | CD19- |
| #01 0-889-121 | #01 0-889-121 P44_[40864,18015].im3 | 0.176 | 0.034 | 0.176 | 0.191 | 0.127 | 1.608 | 6.055 | Tumor | CK+ | CD8- | CD14- | Other- | CD19- |
| #01 0-889-121 | #01 0-889-121 P44_[40864,18015].im3 | 0.179 | 0.052 | 0.227 | 0.227 | 0.136 | 1.147 | 12.131 | Tumor | CK+ | CD8- | CD14- | Other- | CD19- |
mx_normalize()Normalize using two methods: log10, and
mean_divide. Then, calculate Otsu discordance metrics.
mx_norm = mx_normalize(mx_data,
transform = "log10_mean_divide",
method = "None")
# Otsu
mx_norm = run_otsu_discordance(mx_norm,
table="both",
thresold_override=NULL,
plot_out = FALSE)
# umap
mx_norm = run_reduce_umap(mx_norm,
table="both",
marker_list = mx_data$marker_cols,
downsample_pct = .1,
metadata_cols = mx_data$metadata_cols)
Summary table that calculates the metrics from the Bioinformatics paper is printed below
# calculate metrics for unnormalized and normalized data
summary(mx_norm)
## Call:
## `mx_dataset` object with 153 slide(s), 7 marker column(s), and 6 metadata column(s)
##
## Normalization:
## Data normalized with transformation=`log10_mean_divide` and method=`None`
##
## Anderson-Darling tests:
## table mean_test_statistic mean_std_test_statistic mean_p_value
## normalized 478.249 34.932 0
## raw 1064.311 97.682 0
##
## Threshold discordance scores:
## table mean_discordance sd_discordance
## normalized 0.039 0.048
## raw 0.098 0.103
##
## Clustering consistency (UMAP):
## table adj_rand_index cohens_kappa
## normalized 0.007 0.000
## raw 0.010 -0.001
First, density plots. Only showing density plots for unnormalized cells for two types of immune markers and 3 subjects.
set.seed(103001)
ids = sample(unique(cell_level_lung$slide_id), 3)
markers = c("cd19", "cd14")
mx_df = mx_data$data %>%
pivot_longer(cd19:dapi, names_to = "marker", values_to = "marker_value") %>%
filter(slide_id %in% ids)
cd14 = mx_df %>%
filter(marker %in% c("cd14")) %>%
ggplot() +
geom_line(stat = "density", aes(marker_value, group = sample_id, color = slide_id),
alpha=0.5, linetype = 2) +
geom_density(aes(marker_value, group = slide_id,
color = slide_id), size = 1.25) +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~marker, scales = "free", ncol = 1) +
theme(legend.position = "none") +
xlim(0, 1) +
labs(x = "")
cd19 = mx_df %>%
filter(marker %in% c("cd19")) %>%
ggplot() +
geom_line(stat = "density", aes(marker_value, group = sample_id, color = slide_id),
alpha=0.5, linetype = 2) +
geom_density(aes(marker_value, group = slide_id,
color = slide_id), size = 1.25) +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~marker, scales = "free", ncol = 1) +
theme(legend.position = "none") +
xlim(0, 0.5) +
labs(x = "marker value")
Discordance plots. Show for subset of markers and maybe subset of subjects.
set.seed(103001)
ids = sample(unique(cell_level_lung$slide_id), 15)
otsu_data = mx_norm$otsu_data %>%
filter(marker %in% c("cd14", "cd19", "cd8", "cd3", "dapi", "ck"),
slide_id %in% ids) %>%
mutate(slide_id = as.numeric(factor(slide_id)),
norm = factor(table, levels = c("raw", "normalized")))
point_size = 2
mean_vals = otsu_data %>%
group_by(norm, slide_id) %>%
summarize(m1 = mean(discordance_score), .groups = "drop")
disc = otsu_data %>%
ggplot() +
geom_point(aes(discordance_score, factor(slide_id), color = marker), size = point_size) +
facet_wrap(~ norm) +
geom_point(data = mean_vals, aes(m1, slide_id), color = "black", fill = "white",
shape = 23, size = point_size) +
labs(x = "discordance score", y = "slide id") +
theme(legend.position = c(.9, .7))
Putting the plots together
(cd14 / cd19) / disc + plot_layout(heights = c(1, 1, 2.5))
rm(mx_data, mean_vals, otsu_data, discordance_score, ids, markers, point_size, mx_df)
Add in phenograph plots. Clustering using marker values for a single subject. First using unnormalized marker values.
unnorm_df = mx_norm$data %>%
filter(slide_id == "#01 0-889-121") %>%
dplyr::select(cd19, cd3, cd14, cd8, ck, contains("phenotype")) %>%
mutate(phenotype = case_when(
phenotype_cd14 == "CD14+" ~ "CD14+",
phenotype_cd19 == "CD19+" ~ "CD19+",
phenotype_cd8 == "CD8+" ~ "CD8+",
phenotype_ck == "CK+" ~ "CK+",
TRUE ~ "other"
))
unnorm_df = na.omit(unnorm_df)
unnorm_df_mat = as.matrix(unnorm_df[, 1:5])
# tsne
tsne_results = Rtsne(unnorm_df_mat, dims = 2,check_duplicates = FALSE)
unnorm_df$tsne1 = tsne_results$Y[,1]
unnorm_df$tsne2 = tsne_results$Y[,2]
# phenograph
phen_results = Rphenograph(unnorm_df_mat, k = 100)
## Finding nearest neighbors...DONE ~ 0.372 s
## Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 46.734 s
## Build undirected graph from the weighted links...DONE ~ 2.785 s
## Run louvain clustering on the graph ...DONE ~ 6.541 s
## Return a community class
## -Modularity value: 0.851991
## -Number of clusters: 16
unnorm_df$cluster <- factor(membership(phen_results[[2]]))
Now using normalized marker values
norm_df = mx_norm$norm_data %>%
filter(slide_id == "#01 0-889-121") %>%
dplyr::select(cd19, cd3, cd14, cd8, ck, contains("phenotype"))
norm_df = na.omit(norm_df)
norm_df_mat = as.matrix(norm_df[, 1:5])
# tsne
#tsne_results = Rtsne(norm_df_mat, dims = 2,check_duplicates = FALSE)
norm_df$tsne1 = tsne_results$Y[,1]
norm_df$tsne2 = tsne_results$Y[,2]
# phenograph
phen_results = Rphenograph(norm_df_mat, k = 100)
## Finding nearest neighbors...DONE ~ 0.558 s
## Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 45.479 s
## Build undirected graph from the weighted links...DONE ~ 2.733 s
## Run louvain clustering on the graph ...DONE ~ 5.872 s
## Return a community class
## -Modularity value: 0.7828095
## -Number of clusters: 14
norm_df$cluster <- factor(membership(phen_results[[2]]))
Plotted results for unnormalized phenograph labels, normalized phenograph labels, and Inform labels. Results are plotted for single subject with 5` ROIs.
p1 = unnorm_df %>%
ggplot(aes(tsne1, tsne2, color = phenotype)) +
geom_point(alpha = 0.5, size = 0.75) +
labs(x = "TSNE 1", y = "TSNE 2", title = "Inform") +
theme(legend.title=element_blank(), legend.position = "right")
p2 = unnorm_df %>%
ggplot(aes(tsne1, tsne2, color = cluster)) +
geom_point(alpha = 0.5, size = 0.75) +
labs(x = "TSNE 1", y = "", title = "Phenograph, unnormalized") +
guides(fill=guide_legend(title="New Legend Title")) +
theme(legend.position = "right")
p3 = norm_df %>%
ggplot(aes(tsne1, tsne2, color = cluster)) +
geom_point(alpha = 0.5, size = 0.75) +
labs(x = "TSNE 1", y = "", title = "Phenograph, normalized") +
theme(legend.position = "right")
p1 / p2 / p3
Data is currently in cell-level format (each row is a cell). To make it easier for modeling counts and proportions, going to summarize the dataset so that each row is a subject and cell types are reported as counts.
Look at only tumor areas.
# aggregate to cell counts for each subject
ovarian_counts = cell_level_ovarian %>%
group_by(sample_id, tissue_category) %>%
summarize(total_cells = n(),
cd68_positive_cells = length(phenotype_cd68[phenotype_cd68 == "CD68+"]),
ki67_positive_cells = length(phenotype_ki67[phenotype_ki67 == "Ki67+"]),
ck_positive_cells = length(phenotype_ck[phenotype_ck == "CK+"]),
cd19_positive_cells = length(phenotype_cd19[phenotype_cd19 == "CD19+"]),
pstat3_positive_cells =
length(phenotype_p_stat3[phenotype_p_stat3 == "pStat3+"]),
cd3_positive_cells = length(phenotype_cd3[phenotype_cd3 == "CD3+"]),
cd8_positive_cells = length(phenotype_cd8[phenotype_cd8 == "CD8+"]),
cd3plus_cd8plus_cells = length(phenotype_cd8[phenotype_cd8 == "CD8+" &
phenotype_cd3 == "CD3+"])
) %>%
ungroup() %>%
dplyr::select(sample_id, tissue_category, total_cells, cd68_positive_cells, cd19_positive_cells, cd3_positive_cells,
cd8_positive_cells) %>%
filter(tissue_category != "Glass") %>%
pivot_longer(cd68_positive_cells:cd8_positive_cells, names_to = "cell_type", values_to = "count") %>%
mutate(proportion = count / total_cells,
cell_type = factor(cell_type, levels = c("cd3_positive_cells", "cd8_positive_cells",
"cd19_positive_cells", "cd68_positive_cells"),
labels = c("CD4 T cells", "CD8 T cells", "B cells", "Macrophages")))
ovarian_counts = full_join(dplyr::select(patient_level_ovarian, sample_id, stage_bin, BRCA_mutation,
primary, age_at_diagnosis),
ovarian_counts, by = "sample_id") %>%
filter(proportion < 0.75,
tissue_category == "Tumor") %>%
mutate(stage_bin = factor(stage_bin, levels = 0:1, labels = c("Stage I/II", "Stage III/IV")),
BRCA_mutation = factor(BRCA_mutation, levels = 0:1, labels = c("BRCA-", "BRCA+")))
# by brca mutation
ovarian_counts %>%
filter(!is.na(BRCA_mutation)) %>%
ggplot(aes(proportion,cell_type, fill = cell_type, height = stat(density))) +
geom_density_ridges(stat = "binline", bins = 15, scale = .95, draw_baseline = FALSE) +
scale_y_discrete(expand = expansion(add = c(0.05, 0.4))) +
facet_wrap(~ BRCA_mutation) +
geom_vline(xintercept = 0, linetype = 2) +
theme(legend.position = "none") +
labs(x = "proportion of cells in tumor area", y = "cell type")
Calculate raw proportions across cell types:
summary_stats = ovarian_counts %>%
group_by(cell_type) %>%
summarize(model = "naive",
proportion = mean(proportion),
lower = proportion - 1.96 * sqrt( (proportion * (1-proportion))/132),
upper = proportion + 1.96 * sqrt( (proportion * (1-proportion))/132)
) %>%
ungroup() %>%
dplyr::select(cell_type, model, proportion, lower, upper)
Model the proportions as a function of BRCA mutation.
# define function to extract proportions and standard deviations for each cell type
get_proportions = function(model, model_name){
if(model_name %in% c("ZIP","ZINB") ){
ci = broom:::broom_confint_terms(model) %>%
filter(grepl("BRCA", term)) %>%
mutate(lower = exp(conf.low), upper = exp(conf.high),
term = str_remove(term, "count_"))
df = left_join(as_tibble(summary(model)$coefficients$count, rownames = "term"),
ci) %>%
janitor::clean_names() %>%
filter(grepl("BRCA", term)) %>%
mutate(BRCA = exp(estimate), p = pr_z,
cell_type = c("CD4 T cells", "CD8 T cells", "B cells", "Macrophages"),
model = model_name) %>%
dplyr::select(cell_type, model, BRCA, lower, upper, p)
return(df)
}
broom::tidy(model, exp = TRUE, conf.int = TRUE) %>%
filter(grepl("BRCA", term)) %>%
mutate(cell_type = c("CD4 T cells", "CD8 T cells", "B cells", "Macrophages"),
model = model_name,
p = p.value) %>%
dplyr::select(cell_type, model, BRCA = estimate, lower = conf.low, upper = conf.high, p)
}
# naive model (square root transform)
# poisson regression
mod_pois = glm(count ~ cell_type * BRCA_mutation + offset(log(total_cells)),
family = "poisson",
data = ovarian_counts)
prop_pois = get_proportions(mod_pois, "Poisson")
# logistic regression
mod_bin = glm(cbind(count, total_cells - count) ~ cell_type * BRCA_mutation,
family = binomial,
data = ovarian_counts)
prop_bin = get_proportions(mod_bin, "binomial")
# negative binomial regression
mod_negbin = glm.nb(count ~ cell_type * BRCA_mutation + offset(log(total_cells)),
data = ovarian_counts)
prop_negbin = get_proportions(mod_negbin, "negbinomial")
# zero inflated poisson?
mod_zeroinfl <- zeroinfl(count ~ cell_type * BRCA_mutation + offset(log(total_cells)) | 1,
dist = "poisson",
data = ovarian_counts)
prop_zeroPois = get_proportions(mod_zeroinfl, "ZIP")
#model = mod_zeroinfl
#model_name = "ZIP"
# zero inflated negbin
mod_zeroNegBin <- zeroinfl(count ~ cell_type * BRCA_mutation + offset(log(total_cells)) | 1,
dist = "negbin",
data = ovarian_counts)
prop_zeroNegBin = get_proportions(mod_zeroNegBin, "ZINB")
## combine results
# make some other more informative and prettier plots. These p-values are super significant.
bind_rows(naive, prop_pois, prop_bin, prop_negbin,prop_zeroPois, prop_zeroNegBin) %>%
#filter(model != "naive") %>%
mutate(model = factor(model, levels = c("naive", "binomial", "Poisson", "negbinomial", "ZIP", "ZINB"))) %>%
ggplot(aes(model, BRCA, color = model, shape = model)) +
geom_point(position = position_dodge(width = 0.3), size = .8) +
geom_errorbar(aes(ymin = lower, ymax = upper), position = position_dodge(width = 0.3), width = .1) +
#ylim(-.1, 0.075) +
theme_minimal() +
facet_wrap(~cell_type) +
labs(x = "")
Use ovarian data, calculate K function. Show picture with high clustering, low clustering, poor quality. Use radius from Ben’s paper. Look at bivariate K as well and show association with survival.